R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Set the JVM start-up parameters option; "-Xmx8g" is the JVM flag for an
# 8 GB maximum heap.
# NOTE(review): no Java-backed package is loaded in the visible part of this
# file — confirm this option is still needed.
options(java.parameters='-Xmx8g')
library(parallelMap)
# Start a socket-mode parallelMap backend with 4 CPUs.
# NOTE(review): no matching parallelStop() is visible here — confirm one is
# called later in the document.
parallelStartSocket(4)
## Starting parallelization in mode=socket with cpus=4.
library(data.table)
library(xgboost)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:xgboost':
## 
##     slice
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(mlr)
## Loading required package: ParamHelpers
library(rgdal)
## Loading required package: sp
## rgdal: version: 1.3-9, (SVN revision 794)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.1.3, released 2017/20/01
##  Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.5/Resources/library/rgdal/gdal
##  GDAL binary built with GEOS: FALSE 
##  Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
##  Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.5/Resources/library/rgdal/proj
##  Linking to sp version: 1.3-1
library(GISTools) 
## Loading required package: maptools
## Checking rgeos availability: TRUE
## Loading required package: RColorBrewer
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: rgeos
## rgeos version: 0.4-2, (SVN revision 581)
##  GEOS runtime version: 3.6.1-CAPI-1.10.1 
##  Linking to sp version: 1.3-1 
##  Polygon checking: TRUE
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:GISTools':
## 
##     map.scale
library(ggplot2)
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(rsample)
## Loading required package: tidyr
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday,
##     week, yday, year
## The following object is masked from 'package:base':
## 
##     date
library(raster)
## 
## Attaching package: 'raster'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following objects are masked from 'package:MASS':
## 
##     area, select
## The following object is masked from 'package:mlr':
## 
##     resample
## The following object is masked from 'package:ParamHelpers':
## 
##     getValues
## The following object is masked from 'package:dplyr':
## 
##     select
## The following object is masked from 'package:data.table':
## 
##     shift
# Read the first one million rows of the training file and the whole test file.
a <- fread(
  "~/Documents/challenge/new-york-city-taxi-fare-prediction/train.csv",
  nrows = 1000000
)
b <- fread(
  "~/Documents/challenge/new-york-city-taxi-fare-prediction/test.csv"
)
# Per-column overview of the training sample (type, NA count, spread, range).
summarizeColumns(a)
##                name      type na       mean      disp    median        mad
## 1               key character  0         NA  0.999999        NA         NA
## 2       fare_amount   numeric  0  11.348079  9.822090   8.50000 4.44780000
## 3   pickup_datetime character  0         NA  0.999991        NA         NA
## 4  pickup_longitude   numeric  0 -72.526640 12.057937 -73.98179 0.01769928
## 5   pickup_latitude   numeric  0  39.929008  7.626154  40.75270 0.02353183
## 6 dropoff_longitude   numeric 10 -72.527860 11.324494 -73.98014 0.01911071
## 7  dropoff_latitude   numeric 10  39.919954  8.201418  40.75317 0.02478314
## 8   passenger_count   integer  0   1.684924  1.323911   1.00000 0.00000000
##         min        max   nlevs
## 1     1.000    1.00000 1000000
## 2   -44.900  500.00000       0
## 3     1.000    9.00000  861755
## 4 -3377.681 2522.27133       0
## 5 -3116.285 2621.62843       0
## 6 -3383.297   45.58162       0
## 7 -3114.339 1651.55343       0
## 8     0.000  208.00000       0
# Deal with NA values in the training data.
# Count missing values per column. colSums(is.na(.)) sums the logical NA
# matrix directly; apply() would first coerce the whole table (character and
# numeric columns alike) to a single character matrix.
colSums(is.na(a))
##               key       fare_amount   pickup_datetime  pickup_longitude 
##                 0                 0                 0                 0 
##   pickup_latitude dropoff_longitude  dropoff_latitude   passenger_count 
##                 0                10                10                 0
# Drop every row that has at least one missing value.
a <- na.omit(a)
##########################################################
## google map
# The API key is read from the environment instead of being committed with
# the source. GGMAP_GOOGLE_API_KEY is the variable ggmap itself uses to hold
# a registered key. Any key that was previously committed in this file should
# be treated as leaked and revoked.
google_api_key <- Sys.getenv("GGMAP_GOOGLE_API_KEY")
if (!nzchar(google_api_key)) {
  stop(
    "Set the GGMAP_GOOGLE_API_KEY environment variable to a Google Maps API key.",
    call. = FALSE
  )
}
register_google(key = google_api_key)
# Static road map centred on New York City, used as the plot background below.
newyork_map <- get_googlemap(
  center = c(lon = -74, lat = 40.7),
  zoom = 10,
  maptype = "roadmap"
)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=40.7,-74&zoom=10&size=640x640&scale=2&maptype=roadmap&key=xxx
# Base map on its own.
ggmap(newyork_map)

# Pickup locations. The colour is a fixed value, so it is set outside aes();
# inside aes() the string would be treated as data, producing a legend and
# the default palette colour instead of the requested one.
ggmap(newyork_map) +
  geom_point(
    data = a,
    aes(x = pickup_longitude, y = pickup_latitude),
    color = "red",
    alpha = 0.08
  ) +
  labs(title = "location of pickup points")
## Warning: Removed 20291 rows containing missing values (geom_point).

# Dropoff locations, drawn in blue.
ggmap(newyork_map) +
  geom_point(
    data = a,
    aes(x = dropoff_longitude, y = dropoff_latitude),
    color = "blue",
    alpha = 0.08
  ) +
  labs(title = "location of dropoff points")
## Warning: Removed 20302 rows containing missing values (geom_point).

##########################################################
## spatial data analysis and delete some points in the water
## visualization
setwd("~/Documents/challenge/DATApro")
# Read the "Export_Output" shapefile layer used as the land outline.
ny_map <- readOGR(
  dsn = "~/Documents/challenge/DATApro",
  layer = "Export_Output"
)
## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/yangwang/Documents/challenge/DATApro", layer: "Export_Output"
## with 10 features
## It has 55 fields
## Integer64 fields read as strings:  OBJECTID POPULATION POP2010 WHITE BLACK AMERI_ES ASIAN HAWN_PI HISPANIC OTHER MULT_RACE MALES FEMALES AGE_UNDER5 AGE_5_9 AGE_10_14 AGE_15_19 AGE_20_24 AGE_25_34 AGE_35_44 AGE_45_54 AGE_55_64 AGE_65_74 AGE_75_84 AGE_85_UP HOUSEHOLDS HSEHLD_1_M HSEHLD_1_F MARHH_CHD MARHH_NO_C MHH_CHILD FHH_CHILD FAMILIES HSE_UNITS VACANT OWNER_OCC RENTER_OCC

# Draw the grey outline in a fixed window, then overlay the given points in red.
draw_ny_points <- function(lon, lat, heading) {
  plot(
    ny_map,
    axes = TRUE, col = "grey",
    xlim = c(-74.3, -73.7), ylim = c(+40, +41.5)
  )
  title(heading)
  box()
  points(lon, lat, col = "red", pch = 20, cex = 0.5)
  invisible(NULL)
}

draw_ny_points(
  a$pickup_longitude, a$pickup_latitude,
  'location of pickup points'
)

draw_ny_points(
  a$dropoff_longitude, a$dropoff_latitude,
  'location of dropoff points'
)

##########################################################
## delete points
# Keep only the pickup points that fall inside the ny_map polygons and plot
# them.
# NOTE(review): the points are assigned ny_map's CRS without reprojection,
# which assumes the taxi coordinates are already in that CRS — confirm.
# NOTE(review): only the plotted point sets are subset in this section; the
# table `a` itself is not filtered here.
sp_pickup <- SpatialPoints(
  data.frame(a$pickup_longitude, a$pickup_latitude),
  proj4string = CRS(proj4string(ny_map))
)
spdf_pickup <- sp_pickup[ny_map]
plot(ny_map, axes = TRUE, col = "grey", xlim = c(-74.3, -73.7),
     ylim = c(+40, +41.5))
title('location of pickup points')
box()
points(spdf_pickup, col = "red", pch = 20, cex = 0.5)

# Coordinates of the retained pickup points (assigned once; the earlier
# duplicate assignment after the dropoff plot produced the same values).
x <- spdf_pickup@coords[, 1]
y <- spdf_pickup@coords[, 2]
xy <- vector()

# Same treatment for the dropoff points. Only the dropoff title is drawn;
# the extra "pickup" title that used to be printed over it is removed.
sp_drop <- SpatialPoints(
  data.frame(a$dropoff_longitude, a$dropoff_latitude),
  proj4string = CRS(proj4string(ny_map))
)
spdf_drop <- sp_drop[ny_map]
plot(ny_map, axes = TRUE, col = "grey", xlim = c(-74.3, -73.7),
     ylim = c(+40, +41.5))
title('location of dropoff points')
box()
points(spdf_drop, col = "red", pch = 20, cex = 0.5)
# Keep only trips whose pickup and dropoff both lie strictly inside the
# longitude (-80, -70) and latitude (35, 45) window. Conditions separated by
# commas are combined with AND.
a <- filter(
  a,
  pickup_longitude > -80, pickup_longitude < -70,
  pickup_latitude > 35, pickup_latitude < 45,
  dropoff_longitude > -80, dropoff_longitude < -70,
  dropoff_latitude > 35, dropoff_latitude < 45
)
##########################################################
## change time
# Parse the pickup timestamp and derive numeric calendar features from it.
# The identifier and the raw timestamp are then dropped by name rather than
# by position, so the step does not depend on the column order of the input.
# dplyr::select is called explicitly because `select` is masked by the
# raster and MASS packages attached above.
a <- a %>%
  mutate(
    pickup_datetime = ymd_hms(pickup_datetime),
    year = as.numeric(year(pickup_datetime)),
    month = as.numeric(month(pickup_datetime)),
    day = as.numeric(day(pickup_datetime)),
    dayOfWeek = as.numeric(wday(pickup_datetime)),
    hour = as.numeric(hour(pickup_datetime)),
    minute = as.numeric(minute(pickup_datetime))
  ) %>%
  dplyr::select(-key, -pickup_datetime)
##########################################################
# Demonstration: query the Google driving distance for the first few trips.
# The column is created up front as NA so rows that are not queried stay
# missing; without it, the first assignment would recycle one distance to
# every row. The loop is capped at the number of rows actually available.
n_demo <- min(10L, nrow(a))
a$dis <- NA_real_
for (i in seq_len(n_demo)) {
  dv <- mapdist(
    c(a$pickup_longitude[i], a$pickup_latitude[i]),
    c(a$dropoff_longitude[i], a$dropoff_latitude[i]),
    mode = "driving"
  )
  a$dis[i] <- dv$miles
}
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.721319,-73.844311&key=xxx
## Multiple addresses found, the first will be returned:
##   107-60 Queens Blvd, Forest Hills, NY 11375, USA
##   107-72 Queens Blvd, Forest Hills, NY 11375, USA
##   Forest Hills - 71 Av, Queens, NY 11375, USA
##   Queens Blvd, Forest Hills, NY 11375, USA
##   Forest Hills, Queens, NY 11375, USA
##   Forest Hills, NY 11375, USA
##   Queens, NY, USA
##   Queens County, Queens, NY, USA
##   New York, NY, USA
##   Long Island, New York, USA
##   New York, USA
##   United States
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.712278,-73.84161&key=xxx
## Multiple addresses found, the first will be returned:
##   111 Puritan Avenue Suite 3d, Forest Hills, NY 11375, Forest Hills, NY 11375, United States
##   1-65 Greenway S, Flushing, NY 11375, USA
##   154 Puritan Ave, Forest Hills, NY 11375, USA
##   75 Puritan Ave, Forest Hills, NY 11375, USA
##   1-144 Puritan Ave, Forest Hills, NY 11375, USA
##   Forest Hills, Queens, NY 11375, USA
##   Forest Hills, NY 11375, USA
##   Queens, NY, USA
##   Queens County, Queens, NY, USA
##   New York, NY, USA
##   Long Island, New York, USA
##   New York, USA
##   United States
## Source : https://maps.googleapis.com/maps/api/distancematrix/json?origins=107-60+Queens+Blvd,+Forest+Hills,+NY+11375,+USA&destinations=111+Puritan+Avenue+Suite+3d,+Forest+Hills,+NY+11375,+Forest+Hills,+NY+11375,+United+States&key=xxx&mode=driving
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.711303,-74.016048&key=xxx
## Multiple addresses found, the first will be returned:
##   395 South End Ave, New York, NY 10280, USA
##   389 South End Ave, New York, NY 10280, USA
##   399-375 South End Ave, New York, NY 10280, USA
##   Battery Park City, New York, NY, USA
##   New York, NY 10280, USA
##   Manhattan, New York, NY, USA
##   New York County, New York, NY, USA
##   New York, NY, USA
##   New York, USA
##   United States
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.782004,-73.979268&key=xxx
## Multiple addresses found, the first will be returned:
##   364 Amsterdam Ave, New York, NY 10024, USA
##   Amsterdam Av/W 77 St, New York, NY 10024, USA
##   378-358 Amsterdam Ave, New York, NY 10024, USA
##   New York, NY 10024, USA
##   Upper West Side, New York, NY, USA
##   Manhattan, New York, NY, USA
##   New York County, New York, NY, USA
##   New York, NY, USA
##   New York, USA
##   United States
## Source : https://maps.googleapis.com/maps/api/distancematrix/json?origins=395+South+End+Ave,+New+York,+NY+10280,+USA&destinations=364+Amsterdam+Ave,+New+York,+NY+10024,+USA&key=xxx&mode=driving
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.76127,-73.982738&key=xxx
## Multiple addresses found, the first will be returned:
##   150 W 51st St, New York, NY 10019, USA
##   76179 7th Ave, New York, NY 10019, USA
##   761 7th Ave, New York, NY 10019, USA
##   782-770 7th Ave, New York, NY 10019, USA
##   New York, NY 10020, USA
##   Theater District, New York, NY, USA
##   Midtown Manhattan, New York, NY, USA
##   Manhattan, New York, NY, USA
##   New York County, New York, NY, USA
##   New York, NY, USA
##   New York, USA
##   United States
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.750562,-73.991242&key=xxx
## Multiple addresses found, the first will be returned:
##   420 7th Ave, New York, NY 10119, USA
##   34 St - Penn Station, New York, NY 10120, USA
##   202 W 34th St, New York, NY 10119, USA
##   208 W 33rd St, New York, NY 10001, USA
##   220-200 W 33rd St, New York, NY 10001, USA
##   New York, NY 10119, USA
##   Midtown South, New York, NY, USA
##   Chelsea, New York, NY, USA
##   Manhattan, New York, NY, USA
##   New York County, New York, NY, USA
##   New York, NY, USA
##   New York, USA
##   United States
## Source : https://maps.googleapis.com/maps/api/distancematrix/json?origins=150+W+51st+St,+New+York,+NY+10019,+USA&destinations=420+7th+Ave,+New+York,+NY+10119,+USA&key=xxx&mode=driving
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.733143,-73.98713&key=xxx
## Multiple addresses found, the first will be returned:
##   E 14 St/3 Av, New York, NY 10003, USA
##   200 E 14th St, New York, NY 10003, USA
##   123 3rd Ave, New York, NY 10003, USA
##   111 3rd Ave, New York, NY 10003, USA
##   122 3rd Ave, New York, NY 10003, USA
##   200-298 E 14th St, New York, NY 10003, USA
##   Ukrainian Village, New York, NY, USA
##   New York, NY 10003, USA
##   Manhattan, New York, NY, USA
##   New York County, New York, NY, USA
##   New York, NY, USA
##   New York, USA
##   United States
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.758092,-73.991567&key=xxx
## Multiple addresses found, the first will be returned:
##   345w W 42nd St, New York, NY 10036, USA
##   345 W 42nd St, New York, NY 10036, USA
##   W 42nd St & Port Authority Terminal, W 42nd St, New York, NY 10036, USA
##   343 W 42nd St, New York, NY 10036, USA
##   348-330 W 42nd St, New York, NY 10036, USA
##   New York, NY 10036, USA
##   Midtown South, New York, NY, USA
##   Midtown Manhattan, New York, NY, USA
##   Manhattan, New York, NY, USA
##   New York County, New York, NY, USA
##   New York, NY, USA
##   New York, USA
##   United States
## Source : https://maps.googleapis.com/maps/api/distancematrix/json?origins=E+14+St/3+Av,+New+York,+NY+10003,+USA&destinations=345w+W+42nd+St,+New+York,+NY+10036,+USA&key=xxx&mode=driving
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.768008,-73.968095&key=xxx
## Multiple addresses found, the first will be returned:
##   773 Madison Ave, New York, NY 10065, USA
##   Madison Av/E 66 St, New York, NY 10065, USA
##   21 E 66th St, New York, NY 10065, USA
##   773-791 Madison Ave, New York, NY 10065, USA
##   New York, NY 10065, USA
##   Lenox Hill, New York, NY, USA
##   Central Park West Historic District, New York, NY, USA
##   Manhattan, New York, NY, USA
##   New York County, New York, NY, USA
##   New York, NY, USA
##   New York, USA
##   United States
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.783762,-73.956655&key=xxx
## Multiple addresses found, the first will be returned:
##   1268 Madison Ave, New York, NY 10128, USA
##   46 E 91st St, New York, NY 10128, USA
##   1263 Madison Ave, New York, NY 10128, USA
##   1272-1258 Madison Ave, New York, NY 10128, USA
##   Carnegie Hill, New York, NY, USA
##   New York, NY 10128, USA
##   Upper East Side, New York, NY, USA
##   Manhattan, New York, NY, USA
##   New York County, New York, NY, USA
##   New York, NY, USA
##   New York, USA
##   United States
## Source : https://maps.googleapis.com/maps/api/distancematrix/json?origins=773+Madison+Ave,+New+York,+NY+10065,+USA&destinations=1268+Madison+Ave,+New+York,+NY+10128,+USA&key=xxx&mode=driving
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.73163,-74.000964&key=xxx
## Multiple addresses found, the first will be returned:
##   335 6th Ave, New York, NY 10014, USA
##   333 6th Ave, New York, NY 10014, USA
##   West 4 St-Washington Sq Sta, O, NY 10012, United States
##   339 6th Ave, New York, NY 10014, USA
##   341-321 6th Ave, New York, NY 10014, USA
##   New York, NY 10012, USA
##   West Village, New York, NY, USA
##   Manhattan, New York, NY, USA
##   New York County, New York, NY, USA
##   New York, NY, USA
##   New York, USA
##   United States
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.758233,-73.972892&key=xxx
## Multiple addresses found, the first will be returned:
##   361 Park Ave, New York, NY 10154, USA
##   Park Ave, E 52nd St, New York, NY 10022, United States
##   345 Park Avenue, 345 Park Ave, New York, NY 10154, USA
##   104 E 52nd St, New York, NY 10022, USA
##   100-134 E 52nd St, New York, NY 10022, USA
##   New York, NY 10154, USA
##   Midtown East, New York, NY, USA
##   Midtown Manhattan, New York, NY, USA
##   Manhattan, New York, NY, USA
##   New York County, New York, NY, USA
##   New York, NY, USA
##   New York, USA
##   United States
## Source : https://maps.googleapis.com/maps/api/distancematrix/json?origins=335+6th+Ave,+New+York,+NY+10014,+USA&destinations=361+Park+Ave,+New+York,+NY+10154,+USA&key=xxx&mode=driving
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.751662,-73.980002&key=xxx
## Multiple addresses found, the first will be returned:
##   295 Madison Ave, New York, NY 10017, USA
##   Madison Av & East 40 St, New York, NY 10017, USA
##   287 Madison Ave, New York, NY 10017, USA
##   286 Madison Ave, New York, NY 10017, USA
##   298-282 Madison Ave, New York, NY 10017, USA
##   Murray Hill, New York, NY, USA
##   New York, NY 10017, USA
##   Midtown Manhattan, New York, NY, USA
##   Manhattan, New York, NY, USA
##   New York County, New York, NY, USA
##   New York, NY, USA
##   New York, USA
##   United States
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.764842,-73.973802&key=xxx
## Multiple addresses found, the first will be returned:
##   5 Avenue Station, 1-5, E 59th St, New York, NY 10022, USA
##   2 Central Park S, New York, NY 10019, USA
##   Plaza Food Hall, 1 W 59th St, New York, NY 10019, United States
##   4 W 59th St, New York, NY 10019, USA
##   W 59th St, New York, NY 10019, USA
##   New York, NY 10019, USA
##   Manhattan, New York, NY, USA
##   New York County, New York, NY, USA
##   New York, NY, USA
##   New York, USA
##   United States
## Source : https://maps.googleapis.com/maps/api/distancematrix/json?origins=295+Madison+Ave,+New+York,+NY+10017,+USA&destinations=5+Avenue+Station,+1-5,+E+59th+St,+New+York,+NY+10022,+USA&key=xxx&mode=driving
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.774138,-73.9513&key=xxx
## Multiple addresses found, the first will be returned:
##   1576 1st Avenue, New York, NY 10028, USA
##   1577 1st Avenue, New York, NY 10028, USA
##   1571 1st Avenue, New York, NY 10028, USA
##   1560-1576 1st Avenue, New York, NY 10028, USA
##   Yorkville, New York, NY, USA
##   New York, NY 10028, USA
##   Upper East Side, New York, NY, USA
##   Manhattan, New York, NY, USA
##   New York County, New York, NY, USA
##   New York, NY, USA
##   New York, USA
##   United States
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.751048,-73.990095&key=xxx
## Multiple addresses found, the first will be returned:
##   457 7th Ave, New York, NY 10001, USA
##   34 Street Penn Station, New York, NY 10001, USA
##   179 W 34th St, New York, NY 10001, USA
##   198-168 W 34th St, New York, NY 10001, USA
##   Garment District, New York, NY, USA
##   New York, NY 10001, USA
##   Midtown Manhattan, New York, NY, USA
##   Manhattan, New York, NY, USA
##   New York County, New York, NY, USA
##   New York, NY, USA
##   New York, USA
##   United States
## Source : https://maps.googleapis.com/maps/api/distancematrix/json?origins=1576+1st+Avenue,+New+York,+NY+10028,+USA&destinations=457+7th+Ave,+New+York,+NY+10001,+USA&key=xxx&mode=driving
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.726713,-74.006462&key=xxx
## Multiple addresses found, the first will be returned:
##   74 Charlton St, New York, NY 10014, USA
##   61 Vandam St, New York, NY 10013, USA
##   76-46 Vandam St, New York, NY 10013, USA
##   Hudson Square, New York, NY, USA
##   New York, NY 10013, USA
##   Manhattan, New York, NY, USA
##   New York County, New York, NY, USA
##   New York, NY, USA
##   New York, USA
##   United States
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.731628,-73.993078&key=xxx
## Multiple addresses found, the first will be returned:
##   55 E 9th St, New York, NY 10003, USA
##   40 E 9th St, New York, NY 10003, USA
##   51 E 9th St, New York, NY 10003, USA
##   99-29 E 9th St, New York, NY 10003, USA
##   New York, NY 10003, USA
##   Greenwich Village, New York, NY, USA
##   Manhattan, New York, NY, USA
##   New York County, New York, NY, USA
##   New York, NY, USA
##   New York, USA
##   United States
## Source : https://maps.googleapis.com/maps/api/distancematrix/json?origins=74+Charlton+St,+New+York,+NY+10014,+USA&destinations=55+E+9th+St,+New+York,+NY+10003,+USA&key=xxx&mode=driving
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.733873,-73.980658&key=xxx
## Multiple addresses found, the first will be returned:
##   1 Ave & E 18 St, New York, NY 10009, United States
##   313 1st Avenue, New York, NY 10003, USA
##   310 1st Avenue, New York, NY 10009, USA
##   Stuyvesant Town-Peter Cooper Village, New York, NY, USA
##   New York, NY 10009, USA
##   Midtown Manhattan, New York, NY, USA
##   Manhattan, New York, NY, USA
##   New York County, New York, NY, USA
##   New York, NY, USA
##   New York, USA
##   United States
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.758138,-73.99154&key=xxx
## Multiple addresses found, the first will be returned:
##   345 W 42nd St, New York, NY 10036, USA
##   345w W 42nd St, New York, NY 10036, USA
##   W 42nd St & Port Authority Terminal, W 42nd St, New York, NY 10036, USA
##   343 W 42nd St, New York, NY 10036, USA
##   348-330 W 42nd St, New York, NY 10036, USA
##   New York, NY 10036, USA
##   Midtown South, New York, NY, USA
##   Midtown Manhattan, New York, NY, USA
##   Manhattan, New York, NY, USA
##   New York County, New York, NY, USA
##   New York, NY, USA
##   New York, USA
##   United States
## Source : https://maps.googleapis.com/maps/api/distancematrix/json?origins=1+Ave+&+E+18+St,+New+York,+NY+10009,+United+States&destinations=345+W+42nd+St,+New+York,+NY+10036,+USA&key=xxx&mode=driving
####### example ########
# Load the distance table from disk (no header row) and use its second
# column as the trip distance; the original note marks the unit as miles.
APIdist <- read.csv(
  "/Users/yangwang/Documents/challenge/mapdistance.csv",
  header = FALSE
)
a$dis <- APIdist$V2 ## mile
##########################################################


## distance from airport
# Geocode the three airports by full name. The previous query "LGD, ny" was
# a typo for LaGuardia and was resolved by the geocoder to a street address
# in Brooklyn, so the LaGuardia reference point was wrong. Newark is in New
# Jersey, so it is no longer queried with "ny". The console output of the old
# queries is dropped because it no longer matches these calls.
# Each result is flattened with as.numeric(); the distance code below relies
# on the order (longitude, latitude).
ldg <- as.numeric(geocode("LaGuardia Airport, Queens, NY"))
jfk <- as.numeric(geocode("John F. Kennedy International Airport, Queens, NY"))
ewr <- as.numeric(geocode("Newark Liberty International Airport, Newark, NJ"))
# Pickup and dropoff coordinates as (longitude, latitude) tables.
start <- data.frame(a$pickup_longitude, a$pickup_latitude)
end <- data.frame(a$dropoff_longitude, a$dropoff_latitude)

# Returns 1 where a point lies within `radius` of `airport`, 0 otherwise
# (NA distances stay NA). With lonlat = TRUE, pointDistance() reports metres,
# so the default radius is 10 km.
within_airport_radius <- function(airport, points, radius = 10000) {
  as.numeric(pointDistance(airport, points, lonlat = TRUE) <= radius)
}

# One indicator per airport and trip end. The raw distances are never stored
# on `a`, so no positional column drop is needed afterwards.
a$lgds <- within_airport_radius(ldg, start)
a$lgde <- within_airport_radius(ldg, end)
a$jfks <- within_airport_radius(jfk, start)
a$jfke <- within_airport_radius(jfk, end)
a$ewrs <- within_airport_radius(ewr, start)
a$ewre <- within_airport_radius(ewr, end)

##########################################################
## split a
# Fixed seed so the random 80/20 split is reproducible.
set.seed(1)
a_split_table <- initial_split(a, prop = 0.8)
a_train <- training(a_split_table)
a_test <- testing(a_split_table)
# Helper variables initialised here (not used in this section).
same <- vector()
k <- 1
##########################################################
# Numeric summary of every column in the training part.
summary(a_train)
##   fare_amount     pickup_longitude pickup_latitude dropoff_longitude
##  Min.   :-44.90   Min.   :-78.73   Min.   :37.24   Min.   :-78.73   
##  1st Qu.:  6.00   1st Qu.:-73.99   1st Qu.:40.74   1st Qu.:-73.99   
##  Median :  8.50   Median :-73.98   Median :40.75   Median :-73.98   
##  Mean   : 11.33   Mean   :-73.98   Mean   :40.75   Mean   :-73.97   
##  3rd Qu.: 12.50   3rd Qu.:-73.97   3rd Qu.:40.77   3rd Qu.:-73.97   
##  Max.   :500.00   Max.   :-70.26   Max.   :43.21   Max.   :-70.05   
##  dropoff_latitude passenger_count      year          month       
##  Min.   :37.24    Min.   :0.000   Min.   :2009   Min.   : 1.000  
##  1st Qu.:40.74    1st Qu.:1.000   1st Qu.:2010   1st Qu.: 3.000  
##  Median :40.75    Median :1.000   Median :2012   Median : 6.000  
##  Mean   :40.75    Mean   :1.685   Mean   :2012   Mean   : 6.271  
##  3rd Qu.:40.77    3rd Qu.:2.000   3rd Qu.:2013   3rd Qu.: 9.000  
##  Max.   :44.60    Max.   :6.000   Max.   :2015   Max.   :12.000  
##       day          dayOfWeek          hour           minute     
##  Min.   : 1.00   Min.   :1.000   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.: 8.00   1st Qu.:2.000   1st Qu.: 9.00   1st Qu.:15.00  
##  Median :16.00   Median :4.000   Median :14.00   Median :30.00  
##  Mean   :15.69   Mean   :4.122   Mean   :13.51   Mean   :29.57  
##  3rd Qu.:23.00   3rd Qu.:6.000   3rd Qu.:19.00   3rd Qu.:45.00  
##  Max.   :31.00   Max.   :7.000   Max.   :23.00   Max.   :59.00  
##       dis                lgds             lgde             jfks        
##  Min.   :  0.0008   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:  1.1293   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :  1.9378   Median :0.0000   Median :0.0000   Median :0.00000  
##  Mean   :  3.0193   Mean   :0.1569   Mean   :0.1748   Mean   :0.01632  
##  3rd Qu.:  3.5259   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :381.9786   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##       jfke              ewrs                ewre         
##  Min.   :0.00000   Min.   :0.0000000   Min.   :0.000000  
##  1st Qu.:0.00000   1st Qu.:0.0000000   1st Qu.:0.000000  
##  Median :0.00000   Median :0.0000000   Median :0.000000  
##  Mean   :0.01149   Mean   :0.0003037   Mean   :0.001665  
##  3rd Qu.:0.00000   3rd Qu.:0.0000000   3rd Qu.:0.000000  
##  Max.   :1.00000   Max.   :1.0000000   Max.   :1.000000
##########################################################
##### visual ######
# Work on a copy so the helper columns added below do not end up in a_train.
a_map <- a_train

# Histogram of the fare amount. The stray trailing comma inside theme() is
# removed; it passed an empty argument to the call.
ggplot(a_train, aes(fare_amount)) +
  geom_histogram(fill = "yellow", bins = 50) +
  ggtitle("Distribution of Fare Amount") +
  theme(plot.title = element_text(hjust = .5))

# Density of the trip distance, x axis limited to 0-40.
ggplot(a_train, aes(dis)) +
  geom_density(col = "blue") +
  ggtitle("Density of Training Distance") +
  theme(plot.title = element_text(hjust = .5)) +
  scale_x_continuous(limits = c(0, 40))
## Warning: Removed 306 rows containing non-finite values (stat_density).

# Absolute longitude / latitude difference between pickup and dropoff,
# coloured by fare.
a_map$abslog <- abs(a_map$pickup_longitude - a_map$dropoff_longitude)
a_map$abslat <- abs(a_map$pickup_latitude - a_map$dropoff_latitude)
ggplot(data = a_map, aes(x = abslat, y = abslog, color = fare_amount)) +
  geom_point(size = 1) +
  scale_x_continuous(limits = c(0, 1)) +
  scale_y_continuous(limits = c(0, 1))
## Warning: Removed 142 rows containing missing values (geom_point).

# Fare against distance, restricted to distance <= 20 and fare <= 150.
ggplot(data = a_map, aes(x = dis, y = fare_amount)) +
  geom_point(size = 1, color = 'blue') +
  scale_x_continuous(limits = c(0, 20)) +
  scale_y_continuous(limits = c(0, 150)) +
  labs(title = 'distance<20 and fare<150')
## Warning: Removed 2013 rows containing missing values (geom_point).

# Distance density again, this time after dropping trips of 30 or more.
a_map %>%
  filter(dis < 30) %>%
  ggplot(aes(dis)) +
  geom_density(col = "blue") +
  ggtitle("Density of  Distance") +
  theme(plot.title = element_text(hjust = .5)) +
  scale_x_continuous(limits = c(0, 40))

# Fare against hour of day.
ggplot(data = a_map, aes(x = hour, y = fare_amount)) +
  geom_point(size = 1, color = 'red') +
  labs(title = 'relationship between hour and fare')